# Tidy data
library(tidyverse)
library(lubridate)
# Supplementary analysis
library(survival)
library(TraMineR)
library(rstatix)
library(epiR)
# plotly
library(plotly)
# ggplot2 and addins
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(gridExtra)
library(ggseqplot)
library(ggh4x)
# Other graphics tools
library(treemap)
# Simulate data for a barplot
dataBarplot <- data.frame(
Episode = rep(c("First Covid-19 episode", "Last Covid-19 episode"), each = 63), # Two Covid episodes
Time = rep(rep(c("Acute phase", "Persisting", "3 months or more"), each = 21), 2), # Symptoms last for a given time
Symptoms = rep(c("Abdominal pain", "Anxiety", "Chest pain", "Cognitive disfunction", "Cough", # List of possible symptoms
"Depression", "Dizziness", "Fatigue", "Fever", "Gastrointestinal problem",
"Headache", "Joint pain", "Loss of smell or taste", "Menstruation issue", "Neuralgia",
"New allergies", "Shortness of breath", "Sleeping disorder", "Tachycardia", "Tinnitus",
"Troubled vision"), 6),
N = c(0, 0, 0, 0, 2, 5, 9, 285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
3985, 4921, 3629, 0, 0, 0, 0, 0, 0, 3, 38, 51, 49, 82, 116, 119, 287, 142, 448,
190, 336, 158, 81, 502, 0, 1, 4, 6, 12, 12, 6, 315, 243, 192, 232, 212, 930, 394,
394, 902, 781, 351, 460, 92, 1318, 0, 0, 0, 0, 0, 0, 0, 71, 62, 119, 148, 162,
217, 123, 427, 347, 520, 669, 825, 730, 574, 0, 0, 0, 0, 1, 0, 0, 3, 7, 13, 7, 10,
22, 8, 13, 40, 15, 34, 18, 17, 36, 0, 1, 1, 2, 1, 2, 2, 61, 67, 59, 51, 58, 85,
217, 84, 200, 172, 79, 18, 115, 296)) %>%
# Total number of cases per symptoms and episode
mutate(Ntot = c(rep(by(N[Episode == "First Covid-19 episode"], Symptoms[Episode == "First Covid-19 episode"], sum), 3),
rep(by(N[Episode == "Last Covid-19 episode"], Symptoms[Episode == "Last Covid-19 episode"], sum), 3)))
# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) +
# Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
geom_col(
mapping = aes(N, Symptoms, fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
width = 0.6, # Set the bar width
position = "stack" # Stack bars by the 'fill' variable
) +
# Create facets (separate panels) for each level of the 'Episode' variable
facet_grid(cols = vars(Episode)) +
# Add a vertical line at x = 0 for reference
geom_vline(aes(xintercept = 0)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 6500), # Set the range for the x-axis
breaks = seq(0, 6000, 1000), # Major tick marks every 1000 units
minor_breaks = seq(500, 6500, 500), # Minor tick marks every 500 units
expand = c(0, 0) # Remove padding on the x-axis
) +
# Add text labels to display 'Ntot' values next to each bar
geom_text(
mapping = aes(Ntot + 25, y = Symptoms, label = Ntot), # Position and content of the text
hjust = 0, # Align text to the left of its position
nudge_x = 1 # Slightly nudge text horizontally
) +
# Add titles and subtitles (empty here for now)
labs(title = "", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray") # Minor grid lines for x-axis
)
# Simulate data for a bar plot
dataBarplot <- data.frame(
# Create a "Status" column with repeated categories (12 repetitions for each category)
Status = rep(c("Family member", "Active worker", "Retired worker"), each = 12),
# Create an "Affection" column, categorizing Covid-19 status, with ordered factor levels
Affection = factor(
rep(rep(c("No Covid-19", "Covid-19", "Long Covid-19"), each = 4), 3),
levels = c("No Covid-19", "Covid-19", "Long Covid-19")
),
# Create a "Social_satisfaction" column, representing satisfaction levels, with ordered factor levels
Social_satisfaction = factor(
rep(c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"), 9),
levels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied")
),
# Define a numeric vector "N" representing counts corresponding to the combinations above
N = c(65, 255, 70, 16, 38, 131, 41, 7, 21, 123, 59, 10,
819, 2532, 591, 130, 640, 1723, 357, 38, 235, 1134, 468, 68,
1444, 4212, 677, 128, 528, 1198, 164, 20, 160, 575, 166, 27)
) %>%
# Add new columns using dplyr's mutate function
mutate(
# Compute the total count (Ntot) for each "Affection" group within each "Status" group
Ntot = c(
# For "Family member", calculate totals per "Affection" group
rep(by(N[Status == "Family member"], Affection[Status == "Family member"], sum), each = 4),
# For "Active worker", calculate totals per "Affection" group
rep(by(N[Status == "Active worker"], Affection[Status == "Active worker"], sum), each = 4),
# For "Retired worker", calculate totals per "Affection" group
rep(by(N[Status == "Retired worker"], Affection[Status == "Retired worker"], sum), each = 4)
),
# Calculate percentages (P) for each count (N) relative to the total count (Ntot) per group
P = N / Ntot * 100
)
# Initialize the plot with the dataset (DATA_ex2_2)
ggplot(dataBarplot) +
# Add a stacked bar chart
geom_col(
mapping = aes(P, Affection, fill = Social_satisfaction), # Map 'P' to x, 'COVID_long_3mod' to y, and 'Q_E1' to fill
width = 0.6, # Set the bar width
position = "stack", # Stack bars by the 'fill' variable
color = "white" # Add white border to bars
) +
# Create facets (separate rows) for each level of the Status variable
facet_grid(rows = vars(Status)) +
# Add vertical lines at x = 0 and x = 100 for reference
geom_vline(aes(xintercept = 0)) +
geom_vline(aes(xintercept = 100)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 100.2), # Set the range for the x-axis
breaks = seq(0, 100, 10), # Major tick marks every 10%
minor_breaks = seq(5, 95, 10), # Minor tick marks every 5%
expand = c(0, 0), # Remove padding on the x-axis
labels = paste0(seq(0, 100, 10), "%") # Append "%" to the tick labels
) +
# Add titles and subtitles (empty here for now)
labs(title = " ", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
# axis.text.x = element_blank(), # Uncomment to hide x-axis labels
# axis.text.y = element_blank(), # Uncomment to hide y-axis labels
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray") # Minor grid lines for x-axis
) +
# Customize the fill legend for the bar chart
scale_fill_manual(
"Social interactions", # Title for the legend
values = rev(RColorBrewer::brewer.pal(4, "RdYlGn")), # Reverse color palette
labels = c( # Define labels for legend items
"Very satisfied", # Very satisfactory
"Satisfied", # Somewhat satisfactory
"Unsatisfied", # Somewhat unsatisfactory
"Very unsatisfied" # Very unsatisfactory
)
)
ymax <- 200
coeff <- 70/200
dataBarplot <- data.frame(
year = 2012:2022,
n_cases = c(120, 87, 78, 96, 117, 153, 139, 164, 181, 188, 163),
pop = rep(300000, 11)
) %>%
mutate(
IR = n_cases / pop * 100000,
IR_lower = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 2] * 100000,
IR_upper = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 3] * 100000
)
ggplot(dataBarplot,
aes(
x = year,
y = n_cases,
width = 0.95
)) +
geom_col(
mapping = aes( fill = "Incidence"),
fill = "dodgerblue" # couleur des barres
) +
geom_line(
aes(
x = year,
y = IR,
colour = "Incidence rate" # Nmo dans la legende
),
linetype = 1,
size = 1,
colour = "black"
) +
geom_errorbar(
aes(
x = year,
ymin = IR_lower,
ymax = IR_upper,
width = 0.2
)
) +# Separation de l'axe y
scale_y_continuous(
expand = c(0,0),
limits = c(0, ymax),
breaks = seq(0, ymax, 25),
name = "Incidence",
sec.axis = sec_axis(
trans = ~.*coeff,
name = "Incidence rate",
breaks = seq(0, ymax*coeff, 10)
)
) +
scale_x_continuous(
breaks = 2012:2022,
labels = 2012:2022,
name = ""
) +
theme_classic() +
theme(
panel.grid.major.y = element_line(colour = "grey"),
panel.grid.minor.y = element_line(colour = "lightgray"),
axis.text.x = element_text(size = 11),
axis.line.y.left = element_line(color = "dodgerblue"),
axis.title.y.left = element_text(color = "dodgerblue"),
axis.text.y.left = element_text(color = "dodgerblue")
) + # Titres
labs(
title = "",
fill = "Nature de l'incident",
x = " ",
y = " "
)
dataCurve <- data.frame(
Incident = c(rep("Severe Covid-19 case", 464), rep("Severe adverse reaction", 76)),
Date = rep(seq.POSIXt(as_datetime("2020-06-01 00:00:00", tz = "CET"),
as_datetime("2023-02-01 00:00:00", tz = "CET"), by = "month"),
c(19, 11, 11, 9, 13, 21, 58, 49, 73, 83, 33, 19, 14 ,13, 16, 11,
3, 3, 11, 26, 8, 2, 4, 3, 8, 1, 4, 2, 4, 4, 2, 1, 1))
)
dataDoses <- data.frame(
Date = seq.Date(as.Date("2021-01-01"), as.Date("2023-01-01"), by = "month"),
Ndoses = c(4573, 5343, 5474, 27699, 73329, 137575, 109853, 62694, 29315, 9604,
10456, 95054, 81274, 17859, 3998, 2385, 2384, 1772, 1931, 1471,
2326, 1872, 7005, 9746, 4759)
)
monthly_breaks <- seq.POSIXt(
from = as_datetime("2020-01-01 00:00:00", tz = "CET"),
to = as_datetime("2023-04-01 00:00:00", tz = "CET"),
by = "month"
)
ymax <- max(table(dataCurve$Date))
coeff <- (140000 / ymax) * (ymax / 80)
dataDoses$Ndoses <- dataDoses$Ndoses / coeff
# Initialize the plot with a dataset (DATA_ex1_1)
ggplot(dataCurve) +
# Add a histogram with dodged bars, colored by the "Incident" variable
geom_histogram(
mapping = aes(x = Date, fill = Incident), # Map 'cas_date' to x-axis, and 'Incident' to fill
breaks = monthly_breaks, # Use specified monthly breaks
closed = "left", # Define intervals as left-closed
color = "white", # Set white border for bars
position = "dodge" # Dodge bars side by side
) +
# Add horizontal grid lines with white color
geom_hline(yintercept = 1:ymax, color = "white") +
# Add vertical reference lines for specific dates
geom_vline(
xintercept = as_datetime(c("2021-01-01 00:00:00", "2022-01-01 00:00:00"), tz = "CET"), # Define dates as datetime objects
color = "lavenderblush4", # Set line color
size = 1 # Set line thickness
) +
# Annotate rectangular periods of interest
annotate(
"rect", # Specify rectangle annotation
xmin = as_datetime("2020-04-01 00:00:00", tz = "CET"), # Start of the rectangle
xmax = as_datetime("2020-06-01 00:00:00", tz = "CET"), # End of the rectangle
ymin = 0, # Rectangle starts at y = 0
ymax = ymax, # Rectangle ends at y = ymax
fill = "gray", # Set fill color to gray
alpha = 0.25 # Set transparency
) +
# Repeat annotation for additional periods
annotate("rect", xmin = as_datetime("2020-11-01 00:00:00", tz = "CET"),
xmax = as_datetime("2021-01-01 00:00:00", tz = "CET"), ymin = 0, ymax = ymax, fill = "gray", alpha = 0.25) +
annotate("rect", xmin = as_datetime("2021-04-01 00:00:00", tz = "CET"),
xmax = as_datetime("2021-06-01 00:00:00", tz = "CET"), ymin = 0, ymax = ymax, fill = "gray", alpha = 0.25) +
# Add a line plot for data in DATA_ex1_2
geom_line(
data = dataDoses,
aes(
x = as_datetime(paste(Date, "00:00:00"), tz = "CET"), # Convert Date to datetime
y = Ndoses, # Use Ndoses for y-axis
colour = "Number of vaccine doses" # Set color legend
),
linetype = 5 # Set line type
) +
# Add points on the line plot
geom_point(
data = dataDoses,
aes(x = as_datetime(paste(Date, "00:00:00"), tz = "CET"), y = Ndoses), # Map Date and Ndoses
colour = "darkslateblue", # Set color
shape = 16, # Use solid circles
size = 2 # Set point size
) +
# Format x-axis as datetime
scale_x_datetime(
expand = c(0, 0), # Remove padding
date_breaks = "month", # Major breaks every month
date_minor_breaks = "month", # Minor breaks also monthly
date_labels = "%m/%y" # Format as MM/YY
) +
# Customize color scale for doses
scale_colour_manual("Doses", values = "darkslateblue") +
# Use classic theme for the plot
theme_classic() +
# Modify various theme elements
theme(
panel.grid.minor.y = element_line(colour = "lightgray", size = 1), # Minor grid lines
panel.grid.major.y = element_line(colour = "lavenderblush4", size = 1), # Major grid lines
axis.text.x = element_text(angle = 90), # Rotate x-axis labels
axis.text.y.right = element_text(color = "darkslateblue"), # Color right y-axis text
axis.line.y.right = element_line(color = "darkslateblue"), # Color right y-axis line
axis.title.y.right = element_text(color = "darkslateblue") # Color right y-axis title
) +
# Format y-axis with a secondary axis
scale_y_continuous(
expand = c(0, 0), # Remove padding
limits = c(0, ymax + 1), # Set y-axis limits
breaks = seq(0, 90, 10), # Major breaks every 10
sec.axis = sec_axis(
trans = ~ . * coeff, # Transform secondary axis
name = "Number of doses dispensed", # Secondary axis title
breaks = floor(seq(0, 140000, length.out = 9)), # Secondary axis breaks
labels = formatC( # Format secondary axis labels
floor(seq(0, 140000, length.out = 9)),
format = "f",
big.mark = " ",
digits = 0
)
)
) +
# Add titles and axis labels
labs(title = "", x = " ", y = "Number of incidents") +
# Add text annotations for specific periods
geom_text(mapping = aes_q(
x = as_datetime("2020-07-01 00:00:00", tz = "CET"), y = 82, label = "Period n°1"
)) +
geom_text(mapping = aes_q(
x = as_datetime("2021-07-01 00:00:00", tz = "CET"), y = 82, label = "Period n°2"
)) +
geom_text(mapping = aes_q(
x = as_datetime("2022-08-01 00:00:00", tz = "CET"), y = 82, label = "Period n°3"
))
# Create a data frame for plotting
dataBoxplot <- data.frame(
med_id = rep(1:8, c(8, 3, 3, 5, 7, 6, 4, 11)), # Medical operator IDs
op_id = c(1:8, 1:3, 1:3, 1:5, 1:7, 1:6, 1:4, 1:11), # Sequence of operation IDs
setup = pmax(10 - rpois(47, lambda = 4), 1) # Simulated 'setup' values, ensuring a minimum of 1
)
# Generate a boxplot of the setup grades by operation ID
boxplot(formula = setup ~ op_id, # Plot setup values grouped by op_id
data = dataBoxplot, # Data source
main = paste0("Surgery setup grade out of 10 (p=",
round(summary(lm(setup ~ op_id, dataBoxplot))$coefficient[2, 4], 4), ")"),
# Title includes p-value from a linear model of setup ~ op_id
ylab = "", # Empty y-axis label
xlab = "Number of surgeries", # Label for x-axis
ylim = c(min(dataBoxplot$setup), max(dataBoxplot$setup) + 1.5), # Adjust y-axis limits
boxwex = 0.7, # Box width
range = 0) # Whiskers include all data within the group
# Overlay mean setup values as points and lines
points(x = 1:max(dataBoxplot$op_id), # X-values: operation IDs
y = by(dataBoxplot$setup, dataBoxplot$op_id, mean), # Y-values: group means of setup
type = "b", # Both points and lines
col = "firebrick3", lwd = 2) # Line color and width
# Add text labels showing the number of surgeries per operation ID
text(x = 1:max(dataBoxplot$op_id), # X-values: operation IDs
y = by(dataBoxplot$setup, dataBoxplot$op_id, max) + 0.75, # Y-values: slightly above group maximums
labels = paste0("(", table(dataBoxplot$op_id), ")")) # Labels: counts of surgeries
# Simulate data for a boxplot
dataBoxplot <- data.frame(
id = rep(1:47, 5), # 47 subjects, repeated 5 times for each group
times = rep(paste0("n°", 1:5), each = 47), # 5 time points
val = c( # Simulate values for each time point with different means and SDs
rnorm(n = 47, mean = 2, sd = 1), # Group 1 (mean=2, sd=1)
rnorm(n = 47, mean = 5, sd = 1), # Group 2 (mean=5, sd=1)
rnorm(n = 47, mean = 5, sd = 2), # Group 3 (mean=5, sd=2)
rnorm(n = 47, mean = 8, sd = 2), # Group 4 (mean=8, sd=2)
rnorm(n = 47, mean = 8, sd = 5) # Group 5 (mean=8, sd=5)
)
)
# Perform ANOVA to test differences between the groups
res.aov <- anova_test(
data = dataBoxplot,
dv = val, # Dependent variable: val (size)
wid = id, # Within-subjects factor: id
between = times # Between-subjects factor: times
)
# Pairwise t-tests to compare each group with group 1, adjusting for multiple comparisons using Bonferroni correction
pwc <- dataBoxplot %>%
pairwise_t_test(
val ~ times, # Dependent variable 'val' across levels of 'times'
paired = TRUE, # Paired samples
ref.group = "n°1", # Use "n°1" as the reference group
p.adjust.method = "bonferroni" # Apply Bonferroni correction to p-values
) %>%
add_xy_position(x = "times") # Add xy-position for displaying p-values
# Create the boxplot with individual data points and p-values
ggboxplot(
dataBoxplot, # Data for the boxplot
x = "times", # Group by 'times'
y = "val", # Plot 'val' (size) on the y-axis
add = "point", # Add individual data points to the boxplot
ylab = "Size", # Label for the y-axis
xlab = "" # No label for the x-axis
) +
stat_pvalue_manual(pwc) + # Add p-values from pairwise t-tests
labs(
subtitle = get_test_label(res.aov, detailed = FALSE), # Subtitle for ANOVA result
caption = "2 by 2 testing: Student's t-test, p-value adjustment: Bonferroni" # Caption explaining test details
)
#### Violins
ggstatsplot::ggbetweenstats(ISLR::Wage, education, wage, "np")
n <- 1000 # Number of individuals
df <- data.frame(
id = 1:n,
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = factor(sample(0:1, n, prob = c(0.4, 0.6), replace = TRUE)), # Event (0 = censored, 1 = sick, 2 = dead)
exposure = sample(c("Unexposed", "Exposed"), n, replace = TRUE) # Exposure status (Unexposed or Exposed)
)
df$time[df$event == 0] <- 365 # Set time to 365 for censored observations (event == 0)
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = df) # Fitting a Survival Model
ymax <- 30
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +
geom_step(aes(linetype = strata, color = event), size = 0.8) +
scale_linetype_manual(name = "Group",
values = c('dotted', 'solid'),
labels = c('Non-exposed', 'Exposed')) +
scale_color_manual(name = "Event",
values = c("white", "#1C4073"),
labels = c(' ', "Hospitalised")) +
labs(x = "Période de suivi (jours)", y = "Proportion", title = "") +
geom_vline(aes(xintercept = 0), size = 1) +
geom_hline(aes(yintercept = 1-ymax/100), size = 1) +
scale_y_reverse(limits = 1-c(ymax/100, 1), breaks = 1-seq(ymax/100, 1, 0.1),
labels = paste0(seq(ymax, 100, 10), "%"), expand = c(0,0)) +
scale_x_continuous(breaks = seq(0, 360, 60), expand = c(0,0)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="black", size = 12),
axis.title.y = element_text(face="bold", colour="black", size = 12),
legend.title = element_text(face="bold", size = 10),
panel.grid.major.y = element_line(colour = "lightgray", size = 0.3),
panel.grid.minor.x = element_blank(),
legend.position = c(0.15, 0.30),
legend.background = element_rect(fill = "transparent", color = "white"))
# Simulating survival data
n <- 1000 # Number of individuals
df <- data.frame(
id = 1:n,
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = factor(sample(0:2, n, prob = c(0.4, 0.5, 0.1), replace = TRUE)), # Event (0 = censored, 1 = sick, 2 = dead)
exposure = sample(c("Unexposed", "Exposed"), n, replace = TRUE) # Exposure status (Unexposed or Exposed)
)
df$time[df$event == 0] <- 365 # Set time to 365 for censored observations (event == 0)
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = df) # Fitting a Survival Model
# Plot a survival curve with `autoplot` and enhance it with additional layers
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +
# Add step lines representing survival curves
geom_step(
aes(linetype = strata, color = event), # Map line type to `strata` and color to `event`
size = 0.8 # Set line thickness
) +
# Customize the line types for survival curves
scale_linetype_manual(
name = "Group", # Title for the legend
values = c('dashed', 'solid'), # Define line types: dashed for unexposed, solid for exposed
labels = c('Unexposed', 'Exposed') # Labels for the legend
) +
# Customize the colors for the event states
scale_color_manual(
name = "State", # Title for the legend
values = c("#AAC0AF", "#1C4073", "#B28B84"), # Colors for Healthy, Sick, and Dead states
labels = c('Healthy', 'Sick', 'Dead') # Labels for the legend
) +
# Add axis labels and a title
labs(
x = "Followup period (days)", # X-axis label
y = "Proportion of the population", # Y-axis label
title = "" # Title (empty for now)
) +
# Add a vertical reference line at x = 0
geom_vline(
aes(xintercept = 0), # Position of the line
size = 1 # Thickness of the line
) +
# Customize the y-axis scale
scale_y_continuous(
limits = c(0, 1), # Set the y-axis range from 0 to 1
breaks = seq(0, 1, 0.1), # Major ticks every 0.1
labels = paste0(seq(0, 100, 10), "%"), # Convert tick labels to percentages
expand = c(0, 0) # Remove padding on the y-axis
) +
# Customize the x-axis scale
scale_x_continuous(
breaks = seq(0, 360, 60), # Major ticks every 60 days
expand = c(0, 0) # Remove padding on the x-axis
) +
# Apply a minimal theme for a clean appearance
theme_minimal() +
# Customize specific theme elements
theme(
plot.title = element_text(hjust = 0.5), # Center-align the plot title
axis.title.x = element_text(
face = "bold", # Make x-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
axis.title.y = element_text(
face = "bold", # Make y-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
legend.title = element_text(face = "bold", size = 10), # Style legend titles
panel.grid.major.y = element_line(
colour = "lightgray", # Light gray major grid lines on the y-axis
size = 0.3 # Set line thickness
),
panel.grid.minor.x = element_blank() # Remove minor grid lines on the x-axis
)
createSankeyData <- function(data, states, timesColumns) {
# Function to prepare data for a Sankey diagram
# Arguments:
# data: A dataframe containing sequential data
# states: A vector of unique states
# timesColumns: A vector of column names representing time steps
data <- as.data.frame(data) # Ensure the input is a dataframe
n_states <- length(states) # Number of unique states
n_times <- length(timesColumns) # Number of time columns (steps)
# Convert time columns into factors with specified levels (states)
for (i in 1:n_times) {
data[, timesColumns[i]] <- factor(data[, timesColumns[i]], levels = states)
}
# Define colors for the nodes (states) and links between states
nodesCols <- c("#AAC0AF", "#B28B84", "#1C4073", "#0f766e", "#653239", "#472C1B", "#5C2751")[1:n_states]
linksCols <- c("#D0DCD3", "#D0B8B4", "#285CA4", "#17B5A7", "#964A54", "#76492D", "#8F3D7E")[1:n_states]
vals <- c() # Initialize a vector to store transition counts
# Calculate transitions between consecutive time steps for each state
for (i in 2:n_times) {
for (j in 1:n_states) {
# Count occurrences of transitions from state j at time (i-1) to all states at time i
vals <- c(vals, table(data[, timesColumns[i]][data[, timesColumns[i - 1]] == states[j]]))
}
}
# Prepare Sankey diagram data structure
dataSankeyTem <- list(
Nodes = data.frame(
X = 1:(n_states * n_times), # Node indices (unique for each state and time step)
label = rep(states, n_times), # Node labels (repeated for each time step)
color = rep(nodesCols, n_times) # Node colors
),
Links = data.frame(
source = c(rep(1:(n_states * (n_times - 1)), each = n_states)) - 1, # Source nodes for transitions
target = as.vector(sapply(split((n_states + 1):(n_states * n_times),
rep(1:(n_times - 1), each = n_states)),
function(x) {rep(x, n_states)})) - 1, # Target nodes for transitions
value = vals, # Transition counts
color = rep(rep(linksCols, each = n_states), n_times - 1) # Colors for links
)
)
}
# Example usage: Create Sankey data for three time steps with predefined states
sankeyData <- createSankeyData(
data.frame(
T1 = sample(
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.95, 0.025, 0.025), replace = TRUE),
T2 = sample(
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.75, 0.125, 0.125), replace = TRUE),
T3 = sample(
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.5, 0.25, 0.25), replace = TRUE)),
c("Initial treatment", "Substitution", "No treatment"), # Define states
c("T1", "T2", "T3") # Define time columns
)
plotSankey <- function(Nodes, Links, add.prop = FALSE) {
# Function to create a Sankey diagram using plotly.
# Args:
# Nodes: Data frame containing node details (labels, colors, etc.).
# Links: Data frame containing link details (source, target, values, colors).
# add.prop: Logical. If TRUE, adds percentage proportions to node labels.
if (add.prop) {
# Check if all sources in Links have an equal number of occurrences
if (all(table(Links$source) == table(Links$source)[1])) {
nrow_per_times <- table(Links$source)[1] ^ 2 # Determine rows per iteration
n_times <- nrow(Links) / nrow_per_times # Calculate number of iterations
for (i in 1:n_times) {
# Extract subset of Links for current iteration
tab <- Links[((i - 1) * nrow_per_times + 1):((i - 1) * nrow_per_times + nrow_per_times), ]
# Calculate proportions for targets
vals <- as.numeric(by(tab$value, tab$target, sum)) # Sum values by target
Nodes[Nodes$X %in% (unique(tab$target) + 1), "label"] <- paste0(
Nodes[Nodes$X %in% (unique(tab$target) + 1), "label"],
" (",
formatC(vals / sum(vals) * 100, digits = 1, format = 'f'), # Format as percentage
"%)"
)
if (i == 1) {
# Calculate proportions for sources during the first iteration
vals <- as.numeric(by(tab$value, tab$source, sum)) # Sum values by source
Nodes[Nodes$X %in% (unique(tab$source) + 1), "label"] <- paste0(
Nodes[Nodes$X %in% (unique(tab$source) + 1), "label"],
" (",
formatC(vals / sum(vals) * 100, digits = 1, format = 'f'), # Format as percentage
"%)"
)
}
}
} else {
warning(
"Links arg does not have equal number of each source and function cannot automatically calculate proportions."
)
}
}
# Convert colors in Links to RGBA format with transparency
Links$color <- apply(grDevices::col2rgb(Links$color), 2, function(x) {
paste0("rgba(", x[1], ",", x[2], ",", x[3], ",0.4)") # Add 40% transparency
})
# Create the Sankey diagram using plotly
fig <- plot_ly(
type = "sankey", # Specify Sankey diagram
orientation = "h", # Horizontal orientation
alpha_stroke = 0.2, # Node border transparency
node = list(
label = Nodes$label, # Node labels
color = Nodes$color, # Node colors
pad = 15, # Padding between nodes
thickness = 20, # Node thickness
line = list(color = "black", width = 0.5) # Node border style
),
link = list(
source = Links$source, # Source nodes for links
target = Links$target, # Target nodes for links
value = Links$value, # Link values
color = Links$color # Link colors (RGBA)
)
)
# Customize the layout of the Sankey diagram
fig <- fig %>% layout(
font = list(
size = 14, # Font size for labels
color = "black", # Font color
weight = "bold" # Font weight
)
)
# Return the generated plot
fig
}
plotSankey(sankeyData$Nodes, sankeyData$Links, add.prop = TRUE)
# Create a dataset simulating treatment sequences
carpetData <- data.frame(
T1 = sample( # Initial treatment stage
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.95, 0.025, 0.025), replace = TRUE
),
T2 = sample( # Second treatment stage
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.75, 0.125, 0.125), replace = TRUE
),
T3 = sample( # Third treatment stage
c("Initial treatment", "Substitution", "No treatment"),
size = 1000, prob = c(0.5, 0.25, 0.25), replace = TRUE
)
) %>%
group_by(T1, T2, T3) %>% # Group data by unique sequences
summarise(w = n()) # Summarize the weight (frequency) of each sequence
# Define a sequence object using the TraMineR package
seqCarpet <- seqdef(
data = carpetData[, c("T1", "T2", "T3")], # Extract sequence columns
weights = carpetData$w, # Use weights for the sequences
cpal = c("#AAC0AF", "#B28B84", "#1C4073"), # Custom color palette for states
cnames = c("T1", "T2", "T3") # Custom names for the sequence stages
)
# Define substitution costs using a constant method
couts <- seqsubm(seqCarpet, method = "CONSTANT", cval = 2)
# Compute optimal matching distances
seq.om <- seqdist(seqCarpet,
method = "OM", # Optimal Matching (OM) method
indel = 1, # Insertion/deletion cost
sm = couts) # Substitution matrix
# Perform hierarchical clustering on the sequence distances
seq.dist <- hclust(as.dist(seq.om), method = "ward.D2")
# Create sequence clusters by cutting the dendrogram into 2 groups
seq_cl <- factor(cutree(seq.dist, 2), labels = c("Class n°1", "Class n°2"))
# Plot individual sequence plots and group sequence plots side by side
ggarrange(
ggseqiplot(seqCarpet, facet_ncol = 1, no.n = TRUE) + # Individual plot
theme(
panel.spacing = unit(0.1, "lines"), # Adjust panel spacing
axis.text.y = element_text(colour = "white"), # Hide y-axis text
axis.ticks.y = element_line(colour = "white") # Hide y-axis ticks
) +
labs(y = ""), # Remove y-axis label
ggseqiplot( # Plot grouped sequences
seqCarpet,
group = seq_cl, # Group by clusters
facet_ncol = 1,
no.n = TRUE
) +
force_panelsizes(rows = table(seq_cl)) + # Adjust panel sizes by group
theme(
panel.spacing = unit(0.1, "lines"), # Adjust panel spacing
axis.text.y = element_text(colour = "white"), # Hide y-axis text
axis.ticks.y = element_line(colour = "white") # Hide y-axis ticks
) +
labs(y = ""), # Remove y-axis label
ncol = 2, # Arrange plots in 2 columns
nrow = 1, # Arrange plots in 1 row
common.legend = TRUE, # Use a common legend
legend = "bottom" # Place legend at the bottom
)
# Create a data frame containing cause of death, details (subcategory), and occurrences
dataTreemap <- data.frame(
cause_of_death = c(
"Cancer",
rep("Non cancerous diseases", 6),
rep("Road accidents", 2),
rep("Other or unknown causes", 4),
"Suicide"
),
details = c(
NA, # Subcategories for "Cancer" are not specified
"Cardiac", # Subcategories for "Non cancerous diseases"
"Respiratory",
"Mental disorders",
"Digestive",
"Endocrinian",
"Other",
"Car", # Subcategories for "Road accidents"
"Other",
"Weapons", # Subcategories for "Other or unknown causes"
"Poorly defined",
"Undefined",
"Other",
NA # Subcategories for "Suicide" are not specified
),
n = c(62, 19, 2, 1, 3, 2, 8, 7, 26, 5, 14, 8, 8, 52) # Occurrences for each category/subcategory
)
# Add the total count (n) for each cause_of_death to its label
dataTreemap <- as.data.frame(t(apply(dataTreemap, 1, function(x) {
x[1] <- paste0(
x[1], " (n=", # Append the total count to the cause_of_death label
sum(dataTreemap$n[dataTreemap$cause_of_death == x[1]]),
")"
)
x
})))
dataTreemap$n <- as.numeric(dataTreemap$n) # Convert the n column back to numeric after transformation
# Generate the treemap
treemap(
dataTreemap,
index = c("cause_of_death", "details"), # Define hierarchical levels for the treemap
vSize = "n", # Size of rectangles corresponds to the n column
type = "index", # Color rectangles based on the index
algorithm = "pivotSize", # Use pivot size layout algorithm for better space optimization
title = "", # No title for the treemap
align.labels = list(c("left", "top"), c("left", "bottom")), # Align labels in blocks
palette = "Pastel1", # Use pastel colors for the treemap
fontsize.title = 22, # Title font size
fontcolor.labels = "black", # Set label font color to black
bg.labels = 0, # Transparent background for labels
aspRatio = 1 # Set aspect ratio to make the blocks square
)